On cherche à étudier l’effet de trois facteurs sur le transcriptome des racines d’Arabidopsis thaliana. Le CO2, au cours des études préliminaires, s’est montré peu influent en conditions contrôles de fer et de nitrates, et accentué en cas de stress nutritionnel. Nous reprennons ces résultats avec des fonctions génériques et propres pour en faire le résumé et de jolis graphes.
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
On a, pour chaque gène et chaque condition, son niveau d’expression en sortie de quantification. On labelle les conditions avec le code suivant : lettre majuscule pour le niveau fort, minuscule pour le niveau faible. Le réplicat est donné après l’underscore.
# quantification file
data <- read.csv("quantifFiles/QuantifGenesTomate.csv", h = T, sep = ",")
rownames(data) <- data$Gene
genes = which(!(grepl("__", rownames(data))))
not_quant = data[which((grepl("__", rownames(data)))), ]
data = data[genes, grepl("R", colnames(data))]
getLabel("R6")[1] "cnF_3"
[1] At_AmbientCO2_LowNitrate_Fe1
48 Levels: At_AmbientCO2_HighNitrate_Fe1 ... Sl_ElevatedCO2_LowNitrate_FeStarvation3
[1] "At_AmbientCO2_LowNitrate_Fe"
keep <- rowSums(data) >= 10
data <- data[keep, ]
group <- sapply(colnames(data), getLabel, with.rep = F)
colnames(data) <- sapply(colnames(data), getLabel)
head(data) Cnf_3 cNf_3 cNf_2 cnF_1 cnF_2 cnF_3 cNF_1 CNF_1 cNF_3 CNF_3 CNF_2
Sly00g0382751 400 320 259 280 464 370 228 321 282 348 261
Sly00g0382761 1326 755 910 1026 1429 1303 665 1030 771 1111 943
Sly00g0382831 0 1 0 0 1 0 0 0 1 4 0
cNF_2 CnF_3 cNf_1 CnF_2 CnF_1 Cnf_2 Cnf_1 CNf_2 CNf_3 cnf_2 cnf_1
Sly00g0382751 302 309 306 264 238 312 269 300 223 552 342
Sly00g0382761 882 1467 912 987 1017 1063 1146 825 791 1392 1000
Sly00g0382831 0 0 0 0 0 0 0 0 1 2 0
cnf_3 CNf_1
Sly00g0382751 401 216
Sly00g0382761 1049 687
Sly00g0382831 0 3
[ reached 'max' / getOption("max.print") -- omitted 3 rows ]
[1] 26984 24
On définit les conditions contrôle comme suit : fort nitrate et fort fer.
method = "edger"
g = list()
# reference condition as first element of labels
labels <- c("cNF", "CNF")
genes1 <- dualDE(data, labels, pval = 0.01, method = method) (Intercept) groupCNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cNF_1" "cNF_3" "cNF_2" "CNF_1" "CNF_3" "CNF_2"
cNF_1 cNF_3 cNF_2 CNF_1 CNF_3 CNF_2
0.9224235 1.0094600 1.0262973 1.0164815 1.0169210 1.0084167
[1] "3565 genes DE"
[1] 1298
[1] "2408not in the Micro-Tom annotation..."
(Intercept) groupCnF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnF_1" "cnF_2" "cnF_3" "CnF_3" "CnF_2" "CnF_1"
cnF_1 cnF_2 cnF_3 CnF_3 CnF_2 CnF_1
0.9634586 0.9892413 0.9538002 1.0362710 1.0349523 1.0222766
[1] "5645 genes DE"
[1] "2408not in the Micro-Tom annotation..."
(Intercept) groupCNf
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cNf_3" "cNf_2" "cNf_1" "CNf_2" "CNf_3" "CNf_1"
cNf_3 cNf_2 cNf_1 CNf_2 CNf_3 CNf_1
1.0570743 1.0153384 1.0345625 0.9668619 0.9884527 0.9377102
[1] "3319 genes DE"
[1] "2408not in the Micro-Tom annotation..."
(Intercept) groupCnf
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnf_2" "cnf_1" "cnf_3" "Cnf_3" "Cnf_2" "Cnf_1"
cnf_2 cnf_1 cnf_3 Cnf_3 Cnf_2 Cnf_1
0.9840426 1.0167156 0.9810618 1.0462304 0.9859792 0.9859704
[1] "1813 genes DE"
[1] "2408not in the Micro-Tom annotation..."
On visualise les gènes différentiellement exprimés en commun entre les différents niveaux des autres facteurs.
library(ggVennDiagram)
library(VennDiagram)
gene_list <- list()
for (comp in names(g)) {
print(g[[comp]])
gene_list[[comp]] <- g[[comp]]$gene_id
} gene_id a.value m.value p.value q.value rank
1 Sly10g0177641 8.237314 4.936051 2.648643e-106 7.147099e-102 1
2 Sly04g0238001 8.329990 -3.820946 4.384390e-99 5.915419e-95 2
3 Sly06g0378401 8.863428 -3.715993 1.307763e-88 1.176289e-84 3
4 Sly09g0101621 8.306644 4.979690 9.313548e-86 6.282919e-82 4
5 Sly05g0159891 9.154064 -3.103287 2.718927e-73 1.467350e-69 5
6 Sly12g0052261 8.991532 -3.836117 6.699226e-67 3.012865e-63 6
7 Sly01g0030441 9.307190 -3.552397 3.200801e-66 1.233863e-62 7
8 Sly03g0221711 8.738827 -4.423717 4.544116e-60 1.532730e-56 8
9 Sly01g0015771 10.817456 -2.842569 2.606695e-57 7.815452e-54 9
estimatedDEG upreg
1 1 1
2 1 0
3 1 0
4 1 1
5 1 0
6 1 0
7 1 0
8 1 0
9 1 0
[ reached 'max' / getOption("max.print") -- omitted 3556 rows ]
gene_id a.value m.value p.value q.value rank
1 Sly07g0128951 13.334303 -3.366047 8.444524e-220 2.278670e-215 1
2 Sly09g0080061 8.086623 4.359561 3.153963e-137 4.255326e-133 2
3 Sly07g0125765 8.154734 5.694766 1.281319e-116 1.152504e-112 3
4 Sly10g0186231 14.206179 -2.909205 4.002934e-108 2.700379e-104 4
5 Sly04g0238001 7.878082 -5.096610 2.558700e-106 1.380879e-102 5
6 Sly01g0030441 10.466527 -4.452471 3.608437e-104 1.622834e-100 6
7 Sly03g0209511 10.006146 -2.995633 5.337549e-103 2.057549e-99 7
8 Sly06g0374921 10.468673 2.802062 2.806991e-96 9.467982e-93 8
9 Sly02g0345201 11.514050 -2.995710 2.236747e-93 6.706266e-90 9
estimatedDEG upreg
1 1 0
2 1 1
3 1 1
4 1 0
5 1 0
6 1 0
7 1 0
8 1 1
9 1 0
[ reached 'max' / getOption("max.print") -- omitted 5636 rows ]
gene_id a.value m.value p.value q.value rank estimatedDEG
1 Sly01g0003001 9.263630 -2.686212 4.219440e-46 1.138574e-41 1 1
2 Sly03g0195711 9.251653 2.835068 3.638053e-45 4.908461e-41 2 1
3 Sly12g0048931 7.461099 4.186648 8.781332e-42 7.898515e-38 3 1
4 Sly03g0220701 8.666161 -2.661509 2.824106e-40 1.905142e-36 4 1
5 Sly01g0043921 8.392239 3.252816 2.198928e-38 1.186718e-34 5 1
6 Sly02g0320781 10.126331 -2.717985 6.899784e-38 3.103063e-34 6 1
7 Sly03g0224601 8.093669 -3.169575 1.078629e-36 4.157961e-33 7 1
8 Sly03g0212331 8.477394 -3.273001 1.645800e-36 5.551284e-33 8 1
9 Sly03g0226671 13.983601 3.239270 3.708534e-35 1.111901e-31 9 1
upreg
1 0
2 1
3 1
4 0
5 1
6 0
7 0
8 0
9 1
[ reached 'max' / getOption("max.print") -- omitted 3310 rows ]
gene_id a.value m.value p.value q.value rank estimatedDEG
1 Sly11g0287211 6.911218 4.118103 9.201428e-48 2.482913e-43 1 1
2 Sly06g0363881 9.153901 -2.491356 1.363118e-42 1.839118e-38 2 1
3 Sly01g0000321 10.497482 1.976183 1.412133e-36 1.270166e-32 3 1
4 Sly11g0310491 11.577950 2.862892 9.414796e-35 6.351221e-31 4 1
5 Sly06g0370891 11.208060 -2.302951 8.440313e-30 4.555068e-26 5 1
6 Sly01g0042251 12.240116 1.869367 5.987577e-29 2.692813e-25 6 1
7 Sly07g0109941 11.432100 1.928599 1.963146e-28 7.567646e-25 7 1
8 Sly12g0054281 9.087987 -2.714713 7.903911e-28 2.665989e-24 8 1
9 Sly07g0110001 11.839618 1.866840 2.851457e-27 8.549303e-24 9 1
upreg
1 1
2 0
3 1
4 1
5 0
6 1
7 1
8 0
9 1
[ reached 'max' / getOption("max.print") -- omitted 1804 rows ]
partitions <- get.venn.partitions(gene_list, keep.elements = T)
partitions$shared <- rowSums(partitions[1:4])
partitions <- partitions[order(-partitions$shared), ]
sharedBy3 <- unique(unlist(subset(partitions, partitions$shared == 3)$..values..))
a <- OntologyProfile(sharedBy3, specie = specie)[1] "2408not in the Micro-Tom annotation..."
Solyc Sly
4508 Solyc04g012050 Sly04g0238001
6146 Solyc05g054820 Sly05g0159891
5409 Solyc04g074950 Sly04g0251691
1420 Solyc01g007990 Sly01g0003001
1005 Solyc12g096890 Sly12g0074121
910 Solyc12g055710 Sly12g0068931
8731 Solyc10g007280 Sly10g0164351
494 Solyc09g010980 Sly09g0082341
7754 Solyc07g065970 Sly07g0132631
4241 Solyc03g122340 Sly03g0227451
4288 Solyc03g123860 Sly03g0228011
3200 Solyc02g088630 Sly02g0345201
Description
4508 Ethylene-responsive transcription factor (AHRD V3.3 *-* W9RA19_9ROSA)
6146 LOW QUALITY:Exocyst subunit EXO70 family protein (AHRD V3.3 *** B9HTX5_POPTR)
5409 dihydrofolate reductase (AHRD V3.3 *** AT4G24380.1)
1420 Kinase family protein (AHRD V3.3 *** U5G1A9_POPTR)
1005 F-box protein PP2 (AHRD V3.3 *** A0A059PC27_CICAR)
910 RING/U-box superfamily protein (AHRD V3.3 *** AT1G72310.1)
8731 P-loop containing nucleoside triphosphate hydrolases superfamily protein (AHRD V3.3 *** AT3G50940.1)
494 LOW QUALITY:cyclin-dependent kinase inhibitor (AHRD V3.3 -** AT5G02220.1)
7754 Chaperone protein DNAj, putative (AHRD V3.3 *** B9SXA3_RICCO)
4241 lipoxygenase D
4288 Receptor-like protein kinase INRPK1c
3200 Hexosyltransferase (AHRD V3.3 *** K4BC18_SOLLC)
GO
4508 GO:0003677|GO:0003700|GO:0006355
6146 GO:0000145|GO:0006887
5409 <NA>
1420 GO:0004672|GO:0005524|GO:0006468|GO:0004672|GO:0006468|GO:0005524
1005 <NA>
910 GO:0005515|GO:0008270
8731 GO:0005524
494 <NA>
7754 <NA>
4241 GO:0005515|GO:0016491|GO:0055114|GO:0016702|GO:0046872|GO:0055114
4288 GO:0004672|GO:0005524|GO:0006468|GO:0004672|GO:0006468|GO:0005515|GO:0005524
3200 GO:0016757|GO:0047262
Name ensembl_gene_id
4508 SlERF.D5 Sly04g0238001
6146 <NA> Sly05g0159891
5409 <NA> Sly04g0251691
1420 <NA> Sly01g0003001
1005 <NA> Sly12g0074121
910 <NA> Sly12g0068931
8731 <NA> Sly10g0164351
494 <NA> Sly09g0082341
7754 <NA> Sly07g0132631
4241 LOX3-like Sly03g0227451
4288 <NA> Sly03g0228011
3200 <NA> Sly02g0345201
[ reached 'max' / getOption("max.print") -- omitted 88 rows ]
For each comparison, we want to see how many genes are detected as DE.
# getExpression('AT2G35980', conds = c('cNF', 'CNF'))
d <- data.frame(matrix(ncol = 4, nrow = 2))
colnames(d) <- names(g)
row.names(d) <- c("up", "down")
for (comp in names(g)) {
d["up", comp] <- sum(g[[comp]]$upreg == 1)
d["down", comp] <- sum(g[[comp]]$upreg == 0)
}
res <- melt(d)
res$reg = rep(c("up", "down"), 4)
genesco2At <- res
save(genesco2At, file = "genesco2At.RData")
ggplot(res, aes(fill = reg, y = value, x = variable)) + geom_bar(position = "stack",
stat = "identity", alpha = 0.5, color = "black") + ggtitle("Carbon dioxide effet on gene regulation") +
xlab("") + ylab("Number of differentially expressed genes") + coord_flip()On s’interroge sur l’effet de la double carence fer et nitrate en CO2 ambiant et élevé. On ne l’avait pas fait car cela fait varer deux facteurs à la fois.
(Intercept) groupcNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnf_2" "cnf_1" "cnf_3" "cNF_1" "cNF_3" "cNF_2"
cnf_2 cnf_1 cnf_3 cNF_1 cNF_3 cNF_2
0.9960990 1.0268760 0.9952110 0.9500073 1.0067200 1.0250866
[1] "5641 genes DE"
(Intercept) groupCNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "Cnf_3" "Cnf_2" "Cnf_1" "CNF_1" "CNF_3" "CNF_2"
Cnf_3 Cnf_2 Cnf_1 CNF_1 CNF_3 CNF_2
1.0570683 0.9815650 0.9772254 0.9928072 0.9994200 0.9919141
[1] "8258 genes DE"
doubleInt <- list(`ambiant CO2` = genes5$gene_id, `elevanted CO2` = genes6$gene_id)
ggVennDiagram(doubleInt)[1] "2408not in the Micro-Tom annotation..."
Solyc Sly
4508 Solyc04g012050 Sly04g0238001
6146 Solyc05g054820 Sly05g0159891
5409 Solyc04g074950 Sly04g0251691
1420 Solyc01g007990 Sly01g0003001
1005 Solyc12g096890 Sly12g0074121
910 Solyc12g055710 Sly12g0068931
8731 Solyc10g007280 Sly10g0164351
494 Solyc09g010980 Sly09g0082341
7754 Solyc07g065970 Sly07g0132631
4241 Solyc03g122340 Sly03g0227451
4288 Solyc03g123860 Sly03g0228011
3200 Solyc02g088630 Sly02g0345201
Description
4508 Ethylene-responsive transcription factor (AHRD V3.3 *-* W9RA19_9ROSA)
6146 LOW QUALITY:Exocyst subunit EXO70 family protein (AHRD V3.3 *** B9HTX5_POPTR)
5409 dihydrofolate reductase (AHRD V3.3 *** AT4G24380.1)
1420 Kinase family protein (AHRD V3.3 *** U5G1A9_POPTR)
1005 F-box protein PP2 (AHRD V3.3 *** A0A059PC27_CICAR)
910 RING/U-box superfamily protein (AHRD V3.3 *** AT1G72310.1)
8731 P-loop containing nucleoside triphosphate hydrolases superfamily protein (AHRD V3.3 *** AT3G50940.1)
494 LOW QUALITY:cyclin-dependent kinase inhibitor (AHRD V3.3 -** AT5G02220.1)
7754 Chaperone protein DNAj, putative (AHRD V3.3 *** B9SXA3_RICCO)
4241 lipoxygenase D
4288 Receptor-like protein kinase INRPK1c
3200 Hexosyltransferase (AHRD V3.3 *** K4BC18_SOLLC)
GO
4508 GO:0003677|GO:0003700|GO:0006355
6146 GO:0000145|GO:0006887
5409 <NA>
1420 GO:0004672|GO:0005524|GO:0006468|GO:0004672|GO:0006468|GO:0005524
1005 <NA>
910 GO:0005515|GO:0008270
8731 GO:0005524
494 <NA>
7754 <NA>
4241 GO:0005515|GO:0016491|GO:0055114|GO:0016702|GO:0046872|GO:0055114
4288 GO:0004672|GO:0005524|GO:0006468|GO:0004672|GO:0006468|GO:0005515|GO:0005524
3200 GO:0016757|GO:0047262
Name ensembl_gene_id
4508 SlERF.D5 Sly04g0238001
6146 <NA> Sly05g0159891
5409 <NA> Sly04g0251691
1420 <NA> Sly01g0003001
1005 <NA> Sly12g0074121
910 <NA> Sly12g0068931
8731 <NA> Sly10g0164351
494 <NA> Sly09g0082341
7754 <NA> Sly07g0132631
4241 LOX3-like Sly03g0227451
4288 <NA> Sly03g0228011
3200 <NA> Sly02g0345201
[ reached 'max' / getOption("max.print") -- omitted 1145 rows ]
On trouve énormément de gènes, ce qui est cohérent car le nitrate et le fer pris séparément avaient déjà beaucoup d’effet.
On cherche à savoir si il n’y pas pas un soucis avec l’échantillon CNF, qui semble avoi très très peu de différences avec cNF. Aurait-on envoyé les mêmes tubes?
On ne voit que très peu de DEGs en comparant directement cNF et CNF, on cherche donc à savoir si on trouve le même nombre de DEGs quand on compare cNF - x et CNF -x, ce qui validerait l’hypothèse d’un transcriptôme quasi identique. On choisit ici par exemple \(x=\) cnF.
# res <- c() for(x in c('cnF', 'cnf', 'cNf', 'CnF', 'Cnf', 'CNf')){ labels <-
# c('cNF', x) genes7 <- dualDE(data, labels, pval = 0.01, plot = F) labels <-
# c('CNF', x) genes8 <- dualDE(data, labels, pval = 0.01, plot = F) double <-
# list('ambiant CO2' = genes7$gene_id, 'elevanted CO2' = genes8$gene_id)
# print(ggVennDiagram(double)) partitions <- get.venn.partitions(double) res <-
# c(res, (partitions[2, '..count..']+partitions[3,
# '..count..'])/sum(partitions[,'..count..'])*100) } print('Pourcentage des gènes
# DE qui ne sont pas partagés par les deux comparaisons :') print(res)Il semble qu’il y ait quand même pas mal de différences entre ces transcriptomes, car quand on les compare au même troisième transcriptome on a une différence de 1000 gènes (1900 contre 2800). Le diagramme de Venn montre que ces gènes sont en partie différents. Les transcriptômes ont bien l’air différents. Si on prend un autre x :
# labels <- c('cNF', 'cNf') genes7 <- dualDE(data, labels, pval = 0.01) labels <-
# c('CNF', 'cNf') genes8 <- dualDE(data, labels, pval = 0.01) double <-
# list('ambiant CO2' = genes7$gene_id, 'elevanted CO2' = genes8$gene_id)
# ggVennDiagram(double) OntologyProfile(intersect(genes7$gene_id,
# genes8$gene_id), specie = specie)On va faire les corrélations d’expression entre ces deux conditions.
scatter <- function(df, x, y) {
sp <- ggplot(df, aes(log(df[, x]), log(df[, y]))) + geom_bin2d(bins = 70) + scale_fill_gradient2() +
theme(legend.position = "none") + labs(x = x, y = y)
return(sp)
}
comps <- labels <- c("cNF", "CNF")
diffs <- c()
# correlations between ambient and elevated
for (ambient in colnames(data)[grepl("cNF", colnames(data))]) {
for (elevated in colnames(data)[grepl("CNF", colnames(data))]) {
diffs = c(diffs, cor(data[ambient], data[elevated]))
}
}
diffs_other <- c()
for (ambient in colnames(data)[grepl("cNF", colnames(data))]) {
for (elevated in colnames(data)[!grepl("NF", colnames(data))]) {
diffs_other = c(diffs_other, cor(data[ambient], data[elevated]))
}
}
# correlations within ambien and elevated
sames <- c()
samples <- colnames(data)[grepl("cNF", colnames(data))]
sames <- c(sames, cor(data[, samples[1]], data[, samples[2]]), cor(data[, samples[1]],
data[, samples[3]]), cor(data[, samples[2]], data[, samples[3]]))
samples <- colnames(data)[grepl("CNF", colnames(data))]
sames <- c(sames, cor(data[, samples[1]], data[, samples[2]]), cor(data[, samples[1]],
data[, samples[3]]), cor(data[, samples[2]], data[, samples[3]]))
sames[1] 0.9740289 0.9573153 0.9885952 0.9589331 0.9830163 0.9302603
df <- data.frame(`Same condition` = c(sames, rep(NA, 3)), `Ambient vs Elevated` = diffs)
res <- na.omit(melt(df))
# comparisons
ggplot(data = res, aes(x = variable, y = value, fill = variable)) + geom_boxplot(binaxis = "y",
alpha = 0.4, stackdir = "center") + theme(axis.text.x = element_text(angle = 320,
hjust = 0, colour = "grey50"), plot.title = element_text(size = 14, face = "bold")) +
ggtitle("Correlation for expression accross conditions") + geom_dotplot(binaxis = "y",
stackdir = "center") + stat_compare_means(method = "wilcox.test", hide.ns = FALSE,
label = "p.signif", symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05,
1), symbols = c("****", "***", "**", "*", "ns")))res <- rbind.data.frame(res, data.frame(variable = rep("cNF vs all others except CNF",
length(diffs_other)), value = diffs_other))
ggplot(data = res, aes(x = variable, y = value, fill = variable)) + geom_boxplot(binaxis = "y",
alpha = 0.4, stackdir = "center") + theme(axis.text.x = element_text(angle = 320,
hjust = 0, colour = "grey50"), plot.title = element_text(size = 14, face = "bold")) +
ggtitle("Correlation for expression accross conditions") + geom_dotplot(binaxis = "y",
stackdir = "center") + stat_compare_means(method = "wilcox.test", hide.ns = FALSE,
label = "p.signif", symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05,
1), symbols = c("****", "***", "**", "*", "ns"))) cNF et CNF semblent bien plus proches entre eux que cNF ne l’est avec tous les autres.